Loading packages

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
#library(readr)
library(leaflet)
#library(mapboxapi)
#library(htmlwidgets)
#library(osmdata)
library(sf)
## Warning: package 'sf' was built under R version 4.1.2
## Linking to GEOS 3.9.1, GDAL 3.4.0, PROJ 8.1.1; sf_use_s2() is TRUE
#library(ggmap)
#library(magrittr)
#library(spatstat)
library(sp)
library(htmlwidgets)
library(webshot)
library(maps)
## 
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
## 
##     map

Loading 2019 data

file_name <- "data/bluebikes_tripdata_2019.csv"
# read the full-year 2019 trip data (~2.5M rows, 17 cols); progress bar off for knitting
data_2019 <- read_csv(file_name, progress = FALSE)
## Rows: 2522771 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (3): start station name, end station name, usertype
## dbl  (12): tripduration, start station id, start station latitude, start sta...
## dttm  (2): starttime, stoptime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# give the space-separated column names tidy snake_case names
data_2019 <- data_2019 %>%
  rename(start_long = `start station longitude`,
         start_lat = `start station latitude`,
         end_long = `end station longitude`,
         end_lat = `end station latitude`,
         start_station_id = `start station id`,
         start_station_name = `start station name`,
         end_station_name = `end station name`,
         birth_year = `birth year`)

Basic data explorations

Exploring basic patterns in the data that might be interesting for further exploration

# subscribers vs. customers (single use)
data_2019 %>% filter(usertype == "Subscriber") %>% count() # returns 1988494
## # A tibble: 1 × 1
##         n
##     <int>
## 1 1988494
data_2019 %>% filter(usertype == "Customer") %>% count() # returns 534277
## # A tibble: 1 × 1
##        n
##    <int>
## 1 534277
# genders 
# NOTE(review): in other Motivate/Bluebikes exports the coding is
# 0 = unknown, 1 = male, 2 = female — verify against the data dictionary
data_2019 %>% filter(gender == 0) %>% count() # 277703 (code 0 — likely "unknown"; verify)
## # A tibble: 1 × 1
##        n
##    <int>
## 1 277703
data_2019 %>% filter(gender == 1) %>% count() # 1652699 (code 1 — likely "male"; verify)
## # A tibble: 1 × 1
##         n
##     <int>
## 1 1652699
data_2019 %>% filter(gender == 2) %>% count() # 592369 (code 2 — likely "female"; verify)
## # A tibble: 1 × 1
##        n
##    <int>
## 1 592369
# age range
data_2019 %>% select(birth_year) %>% range() # return 1886 2003
## [1] 1886 2003
# returns 3 rows with the earliest birth year 1886 — almost certainly data errors
data_2019 %>% filter(birth_year == 1886) %>% count() # returns 3
## # A tibble: 1 × 1
##       n
##   <int>
## 1     3
# trip duration range (seconds): the max, 42567137s, is ~16 months — an outlier
data_2019 %>% select(tripduration) %>% range() # 61 42567137
## [1]       61 42567137

There are many more subscribers which suggests that it’s mostly locals using the bikes.

Maybe it’s just me but it seems like there’s something off about the gender data, so I might not include that.

There is also something weird about the age data - there are three entries of 1886 and, in general, quite a few implausibly old riders, so again, I might not use this information.

There is also a massive range in trip duration but that’s not necessarily an error

On the basis of this exploration I select the columns I want to use to reduce the strain on my computer when working with such large files

Cleaning the data prior to analysis

# keep only the columns used in the spatial analysis, to reduce memory use
cleaned_2019 <- data_2019 %>% select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))

cleaned_2019
## # A tibble: 2,522,771 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1          790 2019-12-01 00:01:25 2019-12-01 00:14:35      42.4      -71.1
##  2          166 2019-12-01 00:05:42 2019-12-01 00:08:29      42.4      -71.1
##  3          323 2019-12-01 00:08:28 2019-12-01 00:13:52      42.4      -71.1
##  4          709 2019-12-01 00:08:38 2019-12-01 00:20:27      42.4      -71.1
##  5          332 2019-12-01 00:10:08 2019-12-01 00:15:41      42.4      -71.1
##  6          507 2019-12-01 00:14:26 2019-12-01 00:22:53      42.4      -71.1
##  7          794 2019-12-01 00:17:46 2019-12-01 00:31:01      42.4      -71.1
##  8         1354 2019-12-01 00:18:19 2019-12-01 00:40:54      42.4      -71.1
##  9         1227 2019-12-01 00:19:01 2019-12-01 00:39:29      42.3      -71.1
## 10         1068 2019-12-01 00:24:41 2019-12-01 00:42:29      42.3      -71.1
## # … with 2,522,761 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>

Let’s make a map and plot the end and start stations as points. Let’s start by plotting each start station. I only want one entry pr. start station, so I need to find the unique stations

# one row per distinct (long, lat, name) combination of start stations
unique_start <- cleaned_2019 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# drop rows whose coordinates are the (0, 0) placeholder
# NOTE(review): base subsetting with a logical index keeps NA-coordinate rows
# as all-NA rows (unlike filter()); NAs are removed explicitly later on
unique_start <- unique_start[!(unique_start$start_long == 0.0000|unique_start$start_lat == 0.0000), ]

# other outliers (stations far outside the study area)
unique_start <- unique_start %>% filter(start_station_name != "BCBS Hingham")
unique_start <- unique_start %>% filter(start_station_name != "Main St at Beacon St")

I do the same for the end stations

# one row per distinct (long, lat, name) combination of end stations
unique_end <- cleaned_2019 %>% select(c(end_long, end_lat, end_station_name)) %>% unique()
# drop rows whose coordinates are the (0, 0) placeholder
unique_end <- unique_end[!(unique_end$end_long == 0.0000|unique_end$end_lat == 0.000), ]
# another outlier far outside the study area
unique_end <- unique_end %>% filter(end_station_name != "BCBS Hingham")

Spatial analysis

Basic maps

Make two quick maps

# quick map of the unique start stations, centred on downtown Boston
m1 <- leaflet(unique_start) %>%
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 12) %>%
  addCircleMarkers(lng = ~start_long, lat = ~start_lat)
m1

# the same map for the unique end stations
m2 <- leaflet(unique_end) %>%
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 12) %>%
  addCircleMarkers(lng = ~end_long, lat = ~end_lat)
m2
#saveWidget(m2, "out/2019_end_stations.html", selfcontained = FALSE)

The two maps look almost identical.

Figuring out where people are going

load Boston city boundary and neighborhoods

# neighborhoods
# neighborhoods of Boston (ESRI shapefile, projected in MA State Plane feet)
neighborhoods <- st_read("data/Boston_Neighborhoods/Boston_Neighborhoods.shp")
## Reading layer `Boston_Neighborhoods' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Boston_Neighborhoods/Boston_Neighborhoods.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 26 features and 7 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 739715.3 ymin: 2908294 xmax: 812255.1 ymax: 2970086
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to WGS 84 (EPSG:4326) so leaflet can draw it on the web map
projected_neighborhoods <- st_transform(neighborhoods, 4326)

Plot the neighborhoods and stations together start stations and neighborhoods

# popup labels: one per neighborhood polygon, one per start-station marker
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start$start_long,
                   lat = unique_start$start_lat,
                   popup = p2_popup)

end stations and neighborhoods

# popup labels: one per neighborhood polygon, one per end-station marker
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p3_popup <- paste0("<strong>End station: </strong>", unique_end$end_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_end$end_long,
                   lat = unique_end$end_lat,
                   # FIX: was popup = p2_popup (start-station labels); these
                   # markers are END stations, so they must use p3_popup
                   popup = p3_popup)

From these maps, we can get an idea of where in Boston people are moving, both within the larger city boundary and within the neighborhoods of the city. Now I want to use st_intersection on the neighborhoods and the points to find the most popular areas.

One thing to consider in the analysis of the most popular areas is that there are very small neighborhoods in central Boston and very large neighborhoods on the outskirts of the city. It is therefore difficult for a small neighborhood to have the same amount of bike stations as larger neighborhoods. However, on the other hand, from just looking at the maps, the central parts of the city are clearly the most popular, so perhaps this is not an issue.

To get the intersections of the points and the neighborhoods, I create a convex hull around the points. First I transform the longitude and latitude coordinates into a sf object.

# remove NAs if there are any (NA rows left over from the base subsetting above)
points <- unique_start %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))

# transform points to sf object (coords interpreted as WGS 84 lon/lat, EPSG:4326)
points_sf <- st_as_sf(points, coords = c("start_long", "start_lat"), crs = 4326)
points_sf
## Simple feature collection with 406 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -71.16649 ymin: 42.2679 xmax: -71.0061 ymax: 42.4148
## Geodetic CRS:  WGS 84
## # A tibble: 406 × 2
##    start_station_name                                           geometry
##  * <chr>                                                     <POINT [°]>
##  1 Dartmouth St at Newbury St                       (-71.07783 42.35096)
##  2 MIT Stata Center at Vassar St / Main St          (-71.09116 42.36213)
##  3 Inman Square at Springfield St.                  (-71.10016 42.37438)
##  4 Third at Binney                                  (-71.08277 42.36544)
##  5 Verizon Innovation Hub 10 Ware Street            (-71.11305 42.37251)
##  6 University Park                                  (-71.10006 42.36265)
##  7 One Kendall Square at Hampshire St / Portland St (-71.09169 42.36628)
##  8 359 Broadway - Broadway at Fayette Street         (-71.10441 42.3708)
##  9 Prudential Center - 101 Huntington Ave           (-71.08066 42.34652)
## 10 Encore                                           (-71.07245 42.39329)
## # … with 396 more rows
# project to a planar CRS so geometric operations (hull, intersection) are metric.
# NOTE(review): EPSG:3035 is an equal-area projection designed for Europe; a
# local CRS (e.g. EPSG:26986, Massachusetts Mainland) would distort Boston less.
# The results are only used for hull/intersection topology, so it still works.
points_3035 <- st_transform(points_sf, crs = 3035)
points_3035
## Simple feature collection with 406 features and 1 field
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -902380.1 ymin: 5513497 xmax: -885856.2 ymax: 5529263
## Projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 406 × 2
##    start_station_name                                          geometry
##  * <chr>                                                    <POINT [m]>
##  1 Dartmouth St at Newbury St                         (-892912 5521013)
##  2 MIT Stata Center at Vassar St / Main St          (-892186.8 5522717)
##  3 Inman Square at Springfield St.                  (-891238.4 5524152)
##  4 Third at Binney                                  (-891637.1 5522270)
##  5 Verizon Innovation Hub 10 Ware Street            (-891770.1 5525035)
##  6 University Park                                  (-892377.8 5523436)
##  7 One Kendall Square at Hampshire St / Portland St (-891797.9 5523009)
##  8 359 Broadway - Broadway at Fayette Street        (-891702.1 5524264)
##  9 Prudential Center - 101 Huntington Ave           (-893420.9 5520963)
## 10 Encore                                           (-888647.6 5523156)
## # … with 396 more rows
# create convex hull: st_union() first merges the 406 points into a single
# MULTIPOINT, then st_convex_hull() wraps them in one POLYGON
points_ch <- st_convex_hull(st_union(points_3035))
points_ch
## Geometry set for 1 feature 
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -902380.1 ymin: 5513497 xmax: -885856.2 ymax: 5529263
## Projected CRS: ETRS89-extended / LAEA Europe
## POLYGON ((-899151.8 5513497, -901419.5 5517212,...
plot(points_ch)

class(points_ch)
## [1] "sfc_POLYGON" "sfc"

I have now created a polygon out of the start station points.

Let’s get the intersection between this polygon and the neighborhoods multipolygon

# make sure they have the same crs as the points before intersecting
neighborhoods_3035 <- st_transform(neighborhoods, crs = 3035)

# clip the neighborhoods to the start-station hull
points_neighborhoods_intersection <- st_intersection(points_ch, st_geometry(neighborhoods_3035))

# dissolve the clipped neighborhoods into one geometry
# (st_union here is a dissolve, not a buffer)
p_n_buff <- st_union(points_neighborhoods_intersection)

Plot the objects together

# base-graphics overlay: hull outline, clipped neighborhoods (red), dissolved union
plot(points_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)

The plot shows that the neighborhoods occupy a significant amount of space within the points polygon. However, there is also a large area that the neighborhoods do not cover. Let’s look at the map of the start stations and neighborhoods again to compare

# repeat of the earlier start-stations + neighborhoods map, for comparison
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start$start_long,
                   lat = unique_start$start_lat,
                   popup = p2_popup)

Looking at the map, we can confirm that towards the northern part of the city, the points occupy a space outside the neighborhoods. Furthermore, recall the points polygon that went down into a point, which is due to the single point located far outside the city to the south.

I do the same for the end stations to see if we find the same patterns.

# remove NAs if there are any
points_end <- unique_end %>% 
  filter(! is.na(end_long)) %>% 
  filter(! is.na(end_lat))
# transform the points into an sf object (WGS 84 lon/lat)
points_end_sf <- st_as_sf(points_end, coords = c("end_long", "end_lat"), crs = 4326)
# project to planar EPSG:3035, matching the start-station workflow
points_end_3035 <- st_transform(points_end_sf, crs = 3035)
# create convex hull (union to MULTIPOINT, then hull)
points_end_ch <- st_convex_hull(st_union(points_end_3035))

# plot it
plot(points_end_ch)

class(points_end_ch)
## [1] "sfc_POLYGON" "sfc"

The polygon looks nearly identical to the start stations polygon

Let’s get the intersection

# clip the neighborhoods to the end-station hull
points_neighborhoods_intersection_end <- st_intersection(points_end_ch, st_geometry(neighborhoods_3035))

# dissolve the clipped neighborhoods into one geometry (a dissolve, not a buffer)
p_n_buff_end <- st_union(points_neighborhoods_intersection_end)

Plot the objects together

# overlay: end-station hull, clipped neighborhoods (red), dissolved union
plot(points_end_ch)
plot(points_neighborhoods_intersection_end, border='red', lwd=2, add = TRUE)
plot(p_n_buff_end, add = TRUE)

It looks very much the same for the end stations. Let’s look at the map of the end stations and the neighborhoods to compare

# popup labels: one per neighborhood polygon, one per end-station marker
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p3_popup <- paste0("<strong>End station: </strong>", unique_end$end_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_end$end_long,
                   lat = unique_end$end_lat,
                   # FIX: was popup = p2_popup (start-station labels); these
                   # markers are END stations, so they must use p3_popup
                   popup = p3_popup)

Once again, the map confirms what we saw in the plot. There is an area beyond the city boundary towards the north where there are a lot of points, and there is a single point towards the south far outside the city, which gives the polygon its point.

From now on, when I just want to plot the stations, I just plot the start stations because the end stations are very similar.

Loading the 2020 data

Let’s look at the 2020 data

file_name <- "data/bluebikes_tripdata_2020.csv"
# read the full-year 2020 trip data (~2M rows, 18 cols — adds "postal code")
data_2020 <- read_csv(file_name, progress = FALSE)
## Rows: 1999446 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (4): start station name, end station name, usertype, postal code
## dbl  (12): tripduration, start station id, start station latitude, start sta...
## dttm  (2): starttime, stoptime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# same snake_case renaming as for the 2019 data, to keep the two sets consistent
data_2020 <- data_2020 %>%
  rename(start_long = `start station longitude`,
         start_lat = `start station latitude`,
         end_long = `end station longitude`,
         end_lat = `end station latitude`,
         start_station_id = `start station id`,
         start_station_name = `start station name`,
         end_station_name = `end station name`,
         birth_year = `birth year`)

Basic data explorations

Let’s do the same initial exploration of the data to see what we’re working with

# subscribers vs. customers (single use)
data_2020 %>% filter(usertype == "Subscriber") %>% count() # returns 1440363
## # A tibble: 1 × 1
##         n
##     <int>
## 1 1440363
data_2020 %>% filter(usertype == "Customer") %>% count() # returns 559083
## # A tibble: 1 × 1
##        n
##    <int>
## 1 559083
# noticeably fewer subscribers than 2019; single-use customers roughly unchanged

# genders (same hedged coding as 2019: 0/1/2 — verify against the data dictionary)
data_2020 %>% filter(gender == 0) %>% count() # 29691 (code 0 — likely "unknown"; verify)
## # A tibble: 1 × 1
##       n
##   <int>
## 1 29691
data_2020 %>% filter(gender == 1) %>% count() # 291321 (code 1 — likely "male"; verify)
## # A tibble: 1 × 1
##        n
##    <int>
## 1 291321
data_2020 %>% filter(gender == 2) %>% count() # 94964 (code 2 — likely "female"; verify)
## # A tibble: 1 × 1
##       n
##   <int>
## 1 94964
# NOTE(review): the three gender counts sum to ~416k out of ~2M rows, so most
# 2020 rows appear to carry no gender value

# all NA — the 2020 export has no birth year information
data_2020 %>% select(birth_year) %>% range() # there is no birth year information
## [1] NA NA
# trip duration range (seconds)
data_2020 %>% select(tripduration) %>% range() # 61 3879352
## [1]      61 3879352
# there are also fewer rows in the 2020 dataset

Cleaning the data prior to analysis

Let’s clean the 2020 data

# keep only the columns needed for the spatial analysis (same set as for 2019)
cleaned_2020 <- data_2020 %>% select(c(tripduration, starttime, stoptime, start_lat, start_long, start_station_name, end_lat, end_long, end_station_name, usertype, year, month))

cleaned_2020
## # A tibble: 1,999,446 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1         1793 2020-11-01 00:00:18 2020-11-01 00:30:12      42.3      -71.0
##  2         1832 2020-11-01 00:00:34 2020-11-01 00:31:07      42.3      -71.0
##  3          262 2020-11-01 00:01:54 2020-11-01 00:06:17      42.3      -71.0
##  4          419 2020-11-01 00:04:00 2020-11-01 00:10:59      42.4      -71.1
##  5          275 2020-11-01 00:04:01 2020-11-01 00:08:36      42.4      -71.1
##  6          684 2020-11-01 00:04:39 2020-11-01 00:16:03      42.3      -71.1
##  7          608 2020-11-01 00:04:43 2020-11-01 00:14:52      42.4      -71.1
##  8          438 2020-11-01 00:05:57 2020-11-01 00:13:15      42.4      -71.1
##  9          541 2020-11-01 00:07:15 2020-11-01 00:16:16      42.4      -71.1
## 10         1138 2020-11-01 00:07:17 2020-11-01 00:26:16      42.4      -71.1
## # … with 1,999,436 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>

Spatial Analysis

Figuring out where people are going and comparison with the 2019 data

I’ll only look at the start stations

# one row per distinct (long, lat, name) combination of 2020 start stations
unique_start_2020 <- cleaned_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# drop rows whose coordinates are the (0, 0) placeholder
unique_start_2020 <- unique_start_2020[!(unique_start_2020$start_long == 0.0000|unique_start_2020$start_lat == 0.0000), ]

# another outlier far outside the study area
unique_start_2020 <- unique_start_2020 %>% filter(start_station_name != "BCBS Hingham")

Let’s plot the start stations with the neighborhoods

# plot the 2020 start stations over the neighborhood polygons
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p5_popup <- paste0("<strong>Start station: </strong>", unique_start_2020$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start_2020$start_long,
                   lat = unique_start_2020$start_lat,
                   popup = p5_popup)

let’s look at the 2019 data for comparison

# 2019 start stations again, for side-by-side comparison with the 2020 map
# (p_popup is reused from the previous chunk — the neighborhood labels)
p2_popup <- paste0("<strong>Start station: </strong>", unique_start$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.057083, lat = 42.361145, zoom = 11.5) %>% 
  addCircleMarkers(lng = unique_start$start_long,
                   lat = unique_start$start_lat,
                   popup = p2_popup)

We can see from the maps that there are areas/stations that aren’t in the 2019 data which are also outside of the city boundary. These are stations towards the west of the city boundary, beyond Brighton.

Let’s look at the intersection between the points and neighborhoods in 2020

# remove NAs if there are any
points_2020 <- unique_start_2020 %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))
# transform the points into an sf object (WGS 84 lon/lat)
points_2020_sf <- st_as_sf(points_2020, coords = c("start_long", "start_lat"), crs = 4326)
# project to planar EPSG:3035, matching the 2019 workflow
points_2020_3035 <- st_transform(points_2020_sf, crs = 3035)
# create convex hull (union to MULTIPOINT, then hull)
points_2020_ch <- st_convex_hull(st_union(points_2020_3035))

# plot it
plot(points_2020_ch)

# FIX: inspected the wrong object (points_end_ch — a copy-paste slip from the
# end-station chunk); the object built above is points_2020_ch. Same class.
class(points_2020_ch)
## [1] "sfc_POLYGON" "sfc"
# clip the neighborhoods to the 2020 start-station hull
points_neighborhoods_intersection_2020 <- st_intersection(points_2020_ch, st_geometry(neighborhoods_3035))

# dissolve the clipped neighborhoods into one geometry (a dissolve, not a buffer)
p_n_buff_2020 <- st_union(points_neighborhoods_intersection_2020)

Plot the objects together

# overlay: 2020 hull, clipped neighborhoods (red), dissolved union
plot(points_2020_ch)
plot(points_neighborhoods_intersection_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_2020, add = TRUE)

unique_start %>% select(start_station_name) %>% unique() %>%  count() # returns 359
## # A tibble: 1 × 1
##       n
##   <int>
## 1   359
unique_start_2020 %>% select(start_station_name) %>% unique() %>% count() # returns 384
## # A tibble: 1 × 1
##       n
##   <int>
## 1   384

The code above shows that the two years have roughly the same number of unique start station names, with 2020 having 25 more unique start stations (384 vs. 359). On the one hand, it is surprising that there is a wider range of start stations in 2020, since most people in the world lived in a highly restricted way due to Covid-19. However, at least in Denmark, this also led to people going outside with more intention, since the walk or bike ride to work was not a part of our daily lives anymore, and this pattern could perhaps also reflect a similar tendency.

Let’s find the most used start station in 2019 and 2020

# adapted from: https://stackoverflow.com/questions/66787325/how-to-find-the-least-frequent-value-in-a-column-of-dataframe-in-r

# Least frequent value of a vector, returned as a named one-element table
# (name = the value, entry = its count). Ties resolve to the first value in
# table order; NAs are dropped by table()'s default behaviour.
Modemin <- function(x){
  freq <- table(x) # frequency of each distinct value in the column
  freq[which.min(freq)]
}
# least used start station in 2020: MTL-ECO4-01 (1 time)
Modemin(data_2020$start_station_name)
## MTL-ECO4-01 
##           1
# least used start station in 2019: 8D QC Station 01 (1 time)
Modemin(data_2019$start_station_name)
## 8D QC Station 01 
##                1
# least used end station in 2020: Mobile Temporary Station 1 (1 time)
Modemin(data_2020$end_station_name)
## Mobile Temporary Station 1 
##                          1
# least used end station in 2019: 8D QC Station 02 (1 time)
Modemin(data_2019$end_station_name)
## 8D QC Station 02 
##                1
# Most frequent value of a vector, returned as a named one-element table
# (name = the value, entry = its count). Ties resolve to the first value in
# table order; NAs are dropped by table()'s default behaviour.
Modemax <- function(x){
  freq <- table(x) # frequency of each distinct value in the column
  freq[which.max(freq)]
}
# most used start station in 2019: MIT at Mass Ave / Amherst St (61056 times)
Modemax(data_2019$start_station_name)
## MIT at Mass Ave / Amherst St 
##                        61056
# most used start station in 2020: Central Square at Mass Ave / Essex St (32668 times)
Modemax(data_2020$start_station_name)
## Central Square at Mass Ave / Essex St 
##                                 32668
# most used end station in 2019: MIT at Mass Ave / Amherst St (56986 times)
Modemax(data_2019$end_station_name)
## MIT at Mass Ave / Amherst St 
##                        56986
# most used end station in 2020: Central Square at Mass Ave / Essex St (33493 times)
Modemax(data_2020$end_station_name)
## Central Square at Mass Ave / Essex St 
##                                 33493

So the most used station in 2020 is used significantly less than the most used station in 2019. Interestingly, the most used start and end stations in 2019 are the same, and the most used start and end stations in 2020 are also the same.

Let’s find out where these stations are and plot them with leaflet

# most used station in 2019 — inspect its coordinates
# (the printed values are rounded to one decimal by tibble display)
data_2019 %>% filter(start_station_name == "MIT at Mass Ave / Amherst St") %>% select(c(start_long, start_lat))
## # A tibble: 61,056 × 2
##    start_long start_lat
##         <dbl>     <dbl>
##  1      -71.1      42.4
##  2      -71.1      42.4
##  3      -71.1      42.4
##  4      -71.1      42.4
##  5      -71.1      42.4
##  6      -71.1      42.4
##  7      -71.1      42.4
##  8      -71.1      42.4
##  9      -71.1      42.4
## 10      -71.1      42.4
## # … with 61,046 more rows
# NOTE(review): the marker below is placed at (-71.1, 42.4), the DISPLAY-ROUNDED
# coordinates from the tibble printout, not the station's exact coordinates —
# consider using the unrounded values for an accurate position
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 12) %>% addCircleMarkers(lng = -71.1, lat = 42.4) %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup)
# most used station in 2020 — inspect its coordinates
data_2020 %>% filter(start_station_name == "Central Square at Mass Ave / Essex St") %>% select(c(start_long, start_lat))
## # A tibble: 32,668 × 2
##    start_long start_lat
##         <dbl>     <dbl>
##  1      -71.1      42.4
##  2      -71.1      42.4
##  3      -71.1      42.4
##  4      -71.1      42.4
##  5      -71.1      42.4
##  6      -71.1      42.4
##  7      -71.1      42.4
##  8      -71.1      42.4
##  9      -71.1      42.4
## 10      -71.1      42.4
## # … with 32,658 more rows

As it turns out, the two stations have the same (display-rounded) longitude and latitude coordinates. Interestingly, the most used station is far outside the city boundary. From the station name in the 2019 data, it seems that this might be a station close to MIT, so let’s find MIT’s coordinates and plot it alongside the point.

# FIX: the two popup labels were swapped — MIT is the INSTITUTION and
# "Central Square at Mass Ave / Essex St" is the START STATION.
p_popup <- paste0("<strong>Institution: </strong>", "MIT")
p_popup2 <- paste0("<strong>Start station: </strong>", "Central Square at Mass Ave / Essex St")

# blue marker: the most used station (display-rounded coords); red marker: MIT
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 12) %>% addCircleMarkers(lng = -71.1, lat = 42.4, popup = p_popup2) %>% 
  addCircleMarkers(lng = -71.092003, lat = 42.360001, popup = p_popup, color = "red") %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5)

So the central campus of MIT is not near the most used stations in 2019 and 2020, but it makes me think that plotting the different higher learning institutions located in Boston might yield interesting results, seeing as many of the stations are clustered around this area.

Load colleges and universities

# colleges and universities point layer (MA State Plane feet)
uni <- st_read("data/Colleges_and_Universities/Colleges_and_Universities.shp")
## Reading layer `Colleges_and_Universities' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Colleges_and_Universities/Colleges_and_Universities.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 60 features and 28 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 742669.8 ymin: 2917826 xmax: 780798.5 ymax: 2964159
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to WGS 84 so leaflet can draw it
projected_uni <- st_transform(uni, 4326)

# drop entries whose Longitude/Latitude attribute columns are the (0, 0) placeholder
projected_uni <- projected_uni[!(projected_uni$Longitude == 0.0000|projected_uni$Latitude == 0.0000), ]

Plot the colleges and universities on a map together with the start stations from 2019 and 2020

p7_popup <- paste0("<strong>Institution: </strong>", projected_uni$Name)

# universities (red) with the 2019 start stations (default blue)
leaflet() %>% addTiles() %>% 
  addCircleMarkers(lng = projected_uni$Longitude, lat = projected_uni$Latitude, color = "red", popup = p7_popup) %>% 
  addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
# universities (red) with the 2020 start stations (default blue)
leaflet() %>% addTiles() %>% 
  addCircleMarkers(lng = projected_uni$Longitude, lat = projected_uni$Latitude, color = "red", popup = p7_popup) %>% 
  addCircleMarkers(lng = unique_start_2020$start_long, lat = unique_start_2020$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)

It does in fact seem that many of the start station points are centered around the location of the colleges and universities.

Let’s make the colleges and universities into a polygon and plot it again

# remove NAs if there are any
points_uni <- projected_uni %>% 
  filter(! is.na(Longitude)) %>% 
  filter(! is.na(Latitude))
# transform points into sf object
points_uni_sf <- st_as_sf(points_uni, coords = c("Longitude", "Latitude"), crs = 4326)
# project to planar EPSG:3035, matching the station workflow
points_uni_3035 <- st_transform(points_uni_sf, crs = 3035)
# create convex hull (union to MULTIPOINT, then hull)
points_uni_ch <- st_convex_hull(st_union(points_uni_3035))
# plot it
plot(points_uni_ch)

# transform back to WGS 84 (EPSG:4326, a GEOGRAPHIC crs) so leaflet can draw it.
# NOTE(review): the variable name says 3035 but the object now holds 4326 —
# misleading, but kept so the later chunks that reference this name still work
points_uni_ch_3035 <- st_transform(points_uni_ch, crs = 4326)

Let’s plot the colleges and universities polygon together the with 2019 start stations

# university hull (yellow) with the 2019 start stations
leaflet(points_uni_ch_3035) %>% addTiles() %>% 
  addPolygons(color = "yellow") %>% 
  addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)
# university hull (yellow) with the 2020 start stations
leaflet(points_uni_ch_3035) %>% addTiles() %>% 
  addPolygons(color = "yellow") %>% 
  addCircleMarkers(lng = unique_start_2020$start_long, lat = unique_start_2020$start_lat) %>% setView(lng = -71.1, lat = 42.35, zoom = 11.5)

So indeed, it does seem that the colleges and institutions take up a fairly large part of the space the points occupy. Let’s confirm this by getting the intersection between the points-polygon and the colleges and universities-polygon

# get the overlap of the start-station hull and the university hull.
# FIX: the original was st_intersection(st_union(points_ch, points_uni_ch)) —
# st_union(a, b) merges the two polygons first, so st_intersection() of that
# single geometry is just the UNION, not the overlap. The two-argument form
# st_intersection(a, b) computes the actual intersection.
uni_points_intersection <- st_intersection(points_ch, points_uni_ch)

# dissolve into a single geometry (a dissolve, not a buffer)
p_n_buff_uni <- st_union(uni_points_intersection)

Plot the objects together

# overlay: start-station hull, its overlap with the university hull, university hull (red)
plot(points_ch)
plot(uni_points_intersection, add = TRUE)
plot(points_uni_ch, border = "red", add = TRUE)

Compare with the neighborhoods and points intersection

# repeat of the earlier neighborhoods overlay, for side-by-side comparison
plot(points_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff, add = TRUE)

Indeed, the intersection with the colleges and universities occupies around the same amount of space as the intersection with the neighborhoods. However, the two intersections also cover much of the same space within the points-polygon, so I next find the intersection between the neighborhoods and the colleges and universities

# clip the neighborhoods to the university hull
neighborhoods_uni_intersection <- st_intersection(points_uni_ch, neighborhoods_3035)

plot(points_uni_ch)
plot(neighborhoods_uni_intersection, border="red", lwd=2, add = TRUE)
# NOTE(review): p_n_buff is the neighborhoods∩START-STATION dissolve from the
# earlier analysis — presumably overlaid here on purpose for comparison; confirm
plot(p_n_buff, add = TRUE)

So, the neighborhoods and the learning institutions occupy much of the same space. Therefore, it is not possible to reliably confirm that the users of the bikes are primarily seeking out the learning institutions. However, the overlay of the institutions on the points did capture a real pattern of use, just like the neighborhoods did.

Let’s look at the map with the most used start/end station again

# FIX: the labels were swapped — MIT is the institution; the marker below shows
# the most used start/end station, so it gets the station label.
p_popup <- paste0("<strong>Institution: </strong>", "MIT")
p_popup2 <- paste0("<strong>Start station: </strong>", "Central Square at Mass Ave / Essex St")

# with slight changes to the view setting (zoomed in on the station)
leaflet(projected_neighborhoods) %>% addTiles() %>% setView(lng = -71.1, lat = 42.4, zoom = 15.5) %>% addCircleMarkers(lng = -71.1, lat = 42.4, popup = p_popup2) %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5)

Here I overlay data from public and non-public schools

# read the public-school point locations (projected CRS: NAD83 / MA Mainland, ftUS)
public <- st_read("data/Public_Schools/Public_Schools.shp")
## Reading layer `Public_Schools' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Public_Schools/Public_Schools.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 131 features and 16 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 744291.9 ymin: 2910470 xmax: 790128.2 ymax: 2968120
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
# reproject to WGS 84 lon/lat for leaflet
public <- st_transform(public, 4326)
# NOTE(review): `public` is already an sf object with a CRS set, so this
# st_as_sf() call is a no-op
public <- st_as_sf(public, crs = 4326)
# NOTE(review): st_convex_hull() on a POINT geometry column returns each
# point unchanged; the effective result here is just dropping the attributes
public <- st_convex_hull(public$geometry)
# extract coordinates
public_coordinates <- st_coordinates(public)
class(public_coordinates)
## [1] "matrix" "array"
# transform to data frame
public_df <- public_coordinates %>% 
  as_tibble()


# I do the same for the non-public schools
# I do the same for the non-public schools
# (CRS: NAD83 / MA Mainland, ftUS; reprojected to WGS 84 below)
non_public <- st_read("data/Non_Public_Schools/Non_Public_Schools.shp")
## Reading layer `Non_Public_Schools' from data source 
##   `/Users/mettehejberg/Documents/6. semester engelsk /Spatial Analytics/exam/data/Non_Public_Schools/Non_Public_Schools.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 82 features and 14 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 746614.7 ymin: 2912741 xmax: 791398.7 ymax: 2965487
## Projected CRS: NAD83 / Massachusetts Mainland (ftUS)
non_public <- st_transform(non_public, 4326)
non_public <- st_as_sf(non_public, crs = 4326)
non_public <- st_convex_hull(non_public$geometry)
# extract coordinates
non_public_coordinates <- st_coordinates(non_public)
class(non_public_coordinates)
## [1] "matrix" "array"
# transform to data frame (the duplicated second assignment of
# non_public_df that used to follow here has been removed)
non_public_df <- non_public_coordinates %>% 
  as_tibble()

# help if they need to be sf objects again: https://stackoverflow.com/questions/44214487/create-sf-object-from-two-column-matrix

Now we can plot the schools at points on the map alongside the start stations

# plain-text popup labels for the two school layers
p_popup <- "Public"
p_popup2 <- "Non-public"

# start stations (default blue), public schools (red), non-public (yellow)
leaflet() %>%
  addTiles() %>%
  addCircleMarkers(lng = unique_start$start_long, lat = unique_start$start_lat) %>%
  addCircleMarkers(lng = public_df$X, lat = public_df$Y,
                   color = "red", popup = p_popup) %>%
  addCircleMarkers(lng = non_public_df$X, lat = non_public_df$Y,
                   color = "yellow", popup = p_popup2) %>%
  setView(lng = -71.1, lat = 42.35, zoom = 11.5)

This plot seems to reflect a usage pattern dominant in the lower part of the city. There are many overlaps of the points. Let’s transform the schools into a polygon and plot again. I do this by finding the collected shared space between the points

# tibble of lon/lat coordinates -> sf POINT layer in WGS 84
non_public_sf <-  st_as_sf(non_public_df, coords = c("X", "Y"), crs = 4326)
class(non_public_sf)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
non_public_sf
## Simple feature collection with 82 features and 0 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -71.16578 ymin: 42.24013 xmax: -70.99945 ymax: 42.38436
## Geodetic CRS:  WGS 84
## # A tibble: 82 × 1
##                geometry
##  *          <POINT [°]>
##  1 (-71.10514 42.32767)
##  2 (-71.05872 42.30139)
##  3 (-71.13015 42.30658)
##  4 (-71.09574 42.34934)
##  5 (-71.13213 42.24426)
##  6 (-71.06087 42.32163)
##  7 (-71.10991 42.26222)
##  8 (-71.12682 42.25037)
##  9 (-71.07193 42.28944)
## 10   (-71.122 42.28169)
## # … with 72 more rows
# project to the metric CRS (EPSG:3035) used for the other hulls in this file
non_public_sf <- st_transform(non_public_sf, 3035)
non_public_sf
## Simple feature collection with 82 features and 0 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: -905296.7 ymin: 5514781 xmax: -887533.8 ymax: 5526907
## Projected CRS: ETRS89-extended / LAEA Europe
## # A tibble: 82 × 1
##               geometry
##  *         <POINT [m]>
##  1   (-895919 5521715)
##  2 (-897213.5 5516538)
##  3 (-898647.9 5522373)
##  4 (-893555.8 5522298)
##  5 (-904763.7 5518759)
##  6 (-895304.3 5517929)
##  7 (-902414.1 5518126)
##  8 (-904025.8 5518718)
##  9 (-898735.2 5516836)
## 10 (-900848.8 5520239)
## # … with 72 more rows
# union all points into one MULTIPOINT, then take its convex hull
non_public_ch <- st_convex_hull(st_union(non_public_sf))
plot(non_public_ch)

# NOTE(review): duplicate of the plot above -- likely a leftover
plot(non_public_ch)

public_sf <- non_public_coordinates %>% 
  as_tibble() %>% 
  sf::st_as_sf(coords=c(1,2), crs = 4326)
class(public_sf)
## [1] "sf"         "tbl_df"     "tbl"        "data.frame"
public_sf <- st_transform(public_sf, 3035)

public_ch <- st_convex_hull(st_union(public_sf))
plot(public_ch)

schools <- st_convex_hull(st_union(non_public_ch, public_ch))
plot(schools)

Plot the schools with the stations points

# station-points hull with the combined schools hull outlined in yellow
plot(points_ch)
plot(schools, border = "yellow", lwd= 2, add = TRUE)

So the schools also take up around the same space as the neighborhoods and the colleges and universities did

Looking at how the patterns vary over time

I need to select specific months and plot their points. I’ll look at June or July for tourist season. And then I’ll also look at the coldest month, January, for a comparison.

I first explore the number of data points in each month

# row counts per month (printed tibbles below); a lighter alternative would
# be count(cleaned_2019, month), which returns all twelve counts at once
cleaned_2019 %>% filter(month == 6) # June has 274,083 data points
## # A tibble: 274,083 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1          241 2019-06-01 00:00:20 2019-06-01 00:04:21      42.4      -71.1
##  2          758 2019-06-01 00:00:28 2019-06-01 00:13:06      42.4      -71.1
##  3          522 2019-06-01 00:00:48 2019-06-01 00:09:30      42.4      -71.1
##  4          474 2019-06-01 00:01:08 2019-06-01 00:09:02      42.4      -71.1
##  5         1289 2019-06-01 00:02:14 2019-06-01 00:23:43      42.3      -71.1
##  6          208 2019-06-01 00:03:43 2019-06-01 00:07:12      42.4      -71.1
##  7          297 2019-06-01 00:03:50 2019-06-01 00:08:48      42.4      -71.1
##  8         3886 2019-06-01 00:04:14 2019-06-01 01:09:00      42.3      -71.1
##  9          578 2019-06-01 00:04:16 2019-06-01 00:13:55      42.3      -71.1
## 10       141875 2019-06-01 00:04:29 2019-06-02 15:29:04      42.3      -71.1
## # … with 274,073 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>
cleaned_2019 %>% filter(month == 7) # July has 317,028 data points 
## # A tibble: 317,028 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1          581 2019-07-01 00:00:41 2019-07-01 00:10:22      42.3      -71.1
##  2          600 2019-07-01 00:00:49 2019-07-01 00:10:50      42.3      -71.1
##  3          349 2019-07-01 00:05:34 2019-07-01 00:11:24      42.4      -71.1
##  4        29455 2019-07-01 00:05:36 2019-07-01 08:16:31      42.3      -71.1
##  5          363 2019-07-01 00:05:45 2019-07-01 00:11:48      42.3      -71.1
##  6         1769 2019-07-01 00:06:28 2019-07-01 00:35:58      42.4      -71.1
##  7          903 2019-07-01 00:06:31 2019-07-01 00:21:34      42.3      -71.1
##  8          787 2019-07-01 00:06:35 2019-07-01 00:19:42      42.3      -71.1
##  9          315 2019-07-01 00:07:20 2019-07-01 00:12:36      42.4      -71.1
## 10          661 2019-07-01 00:08:03 2019-07-01 00:19:05      42.3      -71.1
## # … with 317,018 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>
# August and September has the most data points so that is also worth exploring

# coldest month, for contrast with the summer months above
cleaned_2019 %>% filter(month == 1) # January has the least, 69,872
## # A tibble: 69,872 × 12
##    tripduration starttime           stoptime            start_lat start_long
##           <dbl> <dttm>              <dttm>                  <dbl>      <dbl>
##  1          371 2019-01-01 00:09:13 2019-01-01 00:15:25      42.4      -71.1
##  2          264 2019-01-01 00:33:56 2019-01-01 00:38:20      42.4      -71.1
##  3          458 2019-01-01 00:41:54 2019-01-01 00:49:33      42.4      -71.1
##  4          364 2019-01-01 00:43:32 2019-01-01 00:49:37      42.4      -71.1
##  5          681 2019-01-01 00:49:56 2019-01-01 01:01:17      42.4      -71.1
##  6          549 2019-01-01 00:50:01 2019-01-01 00:59:10      42.4      -71.1
##  7          304 2019-01-01 00:54:48 2019-01-01 00:59:53      42.4      -71.1
##  8          425 2019-01-01 01:00:48 2019-01-01 01:07:53      42.3      -71.1
##  9         1353 2019-01-01 01:03:34 2019-01-01 01:26:07      42.4      -71.1
## 10          454 2019-01-01 01:08:56 2019-01-01 01:16:30      42.3      -71.1
## # … with 69,862 more rows, and 7 more variables: start_station_name <chr>,
## #   end_lat <dbl>, end_long <dbl>, end_station_name <chr>, usertype <chr>,
## #   year <dbl>, month <dbl>

Seeing as I know that it’s mainly locals using the bikes, I’ll stick to looking at either June or July as that seems to be peak tourist season, and I therefore expect to see the biggest differences in patterns in these months.

Let’s look at July!

# July 2019: unique start stations only
July_2019 <- cleaned_2019 %>%
  filter(month == 7) %>%
  select(start_long, start_lat, start_station_name) %>%
  distinct()
# drop the (0, 0) placeholder coordinates if present
July_2019 <- July_2019[!(July_2019$start_long == 0 | July_2019$start_lat == 0), ]

# remove the out-of-area station
July_2019 <- July_2019 %>% filter(start_station_name != "BCBS Hingham")

# popup labels: neighborhood names and start-station names
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p3_popup <- paste0("<strong>Start station: </strong>", July_2019$start_station_name)

# neighborhoods (yellow polygons) with July start stations as markers
leaflet(projected_neighborhoods) %>%
  addPolygons(stroke = TRUE,
              fillColor = "yellow",
              fillOpacity = 0.8,
              smoothFactor = 0.5, # to make it look nicer
              popup = p_popup) %>% # add popup
  addTiles() %>%
  setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>%
  addCircleMarkers(lng = July_2019$start_long,
                   lat = July_2019$start_lat,
                   popup = p3_popup)

Let’s look at January!

# January 2019: unique start stations only
January_2019 <- cleaned_2019 %>%
  filter(month == 1) %>%
  select(start_long, start_lat, start_station_name) %>%
  distinct()
# drop the (0, 0) placeholder coordinates if present
January_2019 <- January_2019[!(January_2019$start_long == 0 | January_2019$start_lat == 0), ]

# popup labels: neighborhood names and start-station names
p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name)
p4_popup <- paste0("<strong>Start station: </strong>", January_2019$start_station_name)

# neighborhoods (yellow polygons) with January start stations as markers
leaflet(projected_neighborhoods) %>%
  addPolygons(stroke = TRUE,
              fillColor = "yellow",
              fillOpacity = 0.8,
              smoothFactor = 0.5, # to make it look nicer
              popup = p_popup) %>% # add popup
  addTiles() %>%
  setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>%
  addCircleMarkers(lng = January_2019$start_long,
                   lat = January_2019$start_lat,
                   popup = p4_popup)

We can clearly see that there are fewer points in January, but other than that, it’s difficult to say anything from this map. So I’ll have to think about that. Now I’ll find the intersection between the neighborhoods and the points in July and January

First, July!

# July start stations with complete coordinates
points_July <- July_2019 %>%
  filter(!is.na(start_long), !is.na(start_lat))

# lon/lat points as sf (WGS 84), then project to the metric CRS (EPSG:3035)
points_July_sf <- st_as_sf(points_July, coords = c("start_long", "start_lat"), crs = 4326)
points_July_3035 <- st_transform(points_July_sf, crs = 3035)

# convex hull around all July start stations
points_July_ch <- st_convex_hull(st_union(points_July_3035))

Get the intersections

# get the intersections
# NOTE(review): this reuses the name `points_neighborhoods_intersection`
# from the all-of-2019 analysis above, overwriting that earlier object
points_neighborhoods_intersection <- st_intersection(points_July_ch, st_geometry(neighborhoods_3035))

# dissolve the intersection pieces into a single feature
p_n_buff_July <- st_union(points_neighborhoods_intersection)

Plot the objects together

# July hull with neighborhoods intersection (red) and its dissolved union
plot(points_July_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff_July, add = TRUE)

Again, looks very much like the general one I’ve created

Now, January!

# January start stations with complete coordinates
points_January <- January_2019 %>%
  filter(!is.na(start_long), !is.na(start_lat))

# lon/lat points as sf (WGS 84), then project to the metric CRS (EPSG:3035)
points_January_sf <- st_as_sf(points_January, coords = c("start_long", "start_lat"), crs = 4326)
points_January_3035 <- st_transform(points_January_sf, crs = 3035)

# convex hull around all January start stations
points_January_ch <- st_convex_hull(st_union(points_January_3035))

Get the intersections

# get the intersections
# NOTE(review): again overwrites `points_neighborhoods_intersection`
points_neighborhoods_intersection <- st_intersection(points_January_ch, st_geometry(neighborhoods_3035))

# dissolve the intersection pieces into a single feature
p_n_buff_January <- st_union(points_neighborhoods_intersection)

Plot the objects together

# January hull with neighborhoods intersection (red) and its dissolved union
plot(points_January_ch)
plot(points_neighborhoods_intersection, border='red', lwd=2, add = TRUE)
plot(p_n_buff_January, add = TRUE)

The points are much more centered around the central part of Boston in January. In fact, almost the entire area the points cover is within the neighborhoods, which is very different from what we have seen so far. This is the first finding that is different and something to mention in the report

To sum up so far: the end and start station maps look very similar. There is a clear difference between the July map and the January map, where it seems that the points in January are more centralized and they are more dispersed in July. This is further supported using the st_intersection function, where much more of the polygon created from the January points is covered by the neighborhoods. Therefore, the users of the bikes do not travel as far outside of the city in January as they do in July.

# January 2020: unique start stations, dropping (0, 0) placeholder coordinates
January_2020 <- cleaned_2020 %>% filter(month == 1)
January_2020 <- January_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
January_2020 <- January_2020[!(January_2020$start_long == 0.0000|January_2020$start_lat == 0.0000),]

p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 
p1_popup <- paste0("<strong>Station: </strong>", January_2020$start_station_name)

# neighborhoods (yellow) with January 2020 start stations
leaflet(projected_neighborhoods) %>% addTiles() %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% 
  addCircleMarkers(lng = January_2020$start_long, lat = January_2020$start_lat, popup = p1_popup) %>% setView(lng = -71.077083, lat = 42.341145, zoom = 11.5)
# July 2020: same preparation, plus removal of the out-of-area station
July_2020 <- cleaned_2020 %>% filter(month == 7)
July_2020 <- July_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
July_2020 <- July_2020[!(July_2020$start_long == 0.0000|July_2020$start_lat == 0.0000),]

July_2020 <-  July_2020 %>% filter(start_station_name != "BCBS Hingham")

p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 
# NOTE(review): p1_popup is reused here, overwriting the January labels
p1_popup <- paste0("<strong>Station: </strong>", July_2020$start_station_name)

# neighborhoods (yellow) with July 2020 start stations
leaflet(projected_neighborhoods) %>% addTiles() %>% 
  addPolygons(stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% 
  addCircleMarkers(lng = July_2020$start_long, lat = July_2020$start_lat, popup = p1_popup) %>% setView(lng = -71.077083, lat = 42.341145, zoom = 11.5)
# July 2020 start stations with complete coordinates
points_July_2020 <- July_2020 %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))

# lon/lat sf points (WGS 84), projected to the metric CRS (EPSG:3035)
points_July_sf_2020 <- st_as_sf(points_July_2020, coords = c("start_long", "start_lat"), crs = 4326)

points_July_3035_2020 <- st_transform(points_July_sf_2020, crs = 3035)
# create convex hull
points_July_ch_2020 <- st_convex_hull(st_union(points_July_3035_2020))

Get the intersections

# get the intersections
points_neighborhoods_intersection_July_2020 <- st_intersection(points_July_ch_2020, st_geometry(neighborhoods_3035))

# dissolve the intersection pieces into a single feature
p_n_buff_July_2020 <- st_union(points_neighborhoods_intersection_July_2020)

Plot the objects together

# July 2020 hull with neighborhoods intersection (red) and dissolved union
plot(points_July_ch_2020)
plot(points_neighborhoods_intersection_July_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_July_2020, add = TRUE)

# January 2020 start stations with complete coordinates
points_January_2020 <- January_2020 %>% 
  filter(! is.na(start_long)) %>% 
  filter(! is.na(start_lat))

# lon/lat sf points (WGS 84), projected to the metric CRS (EPSG:3035)
points_January_sf_2020 <- st_as_sf(points_January_2020, coords = c("start_long", "start_lat"), crs = 4326)

points_January_3035_2020 <- st_transform(points_January_sf_2020, crs = 3035)
# create convex hull
points_January_ch_2020 <- st_convex_hull(st_union(points_January_3035_2020))

Get the intersections

# get the intersections
points_neighborhoods_intersection_January_2020 <- st_intersection(points_January_ch_2020, st_geometry(neighborhoods_3035))

# dissolve the intersection pieces into a single feature
p_n_buff_January_2020 <- st_union(points_neighborhoods_intersection_January_2020)

Plot the objects together

# January 2020 hull with neighborhoods intersection (red) and dissolved union
plot(points_January_ch_2020)
plot(points_neighborhoods_intersection_January_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_January_2020, add = TRUE)

Let’s see how the point patterns look at the peak of lockdown in Boston in 2020. From Wikipedia, I can gather that the lockdown started from the end of March 2020, and the code below shows that the fewest data points are found in April

# row count for April 2020 (printed tibble below)
data_2020 %>% filter(month == 4) # 46,793 
## # A tibble: 46,793 × 18
##    tripduration starttime           stoptime            start_station_id
##           <dbl> <dttm>              <dttm>                         <dbl>
##  1          297 2020-04-01 00:12:27 2020-04-01 00:17:25               67
##  2          152 2020-04-01 00:18:18 2020-04-01 00:20:50              374
##  3          520 2020-04-01 00:18:21 2020-04-01 00:27:01               59
##  4         3917 2020-04-01 00:23:08 2020-04-01 01:28:26               97
##  5         1110 2020-04-01 00:31:45 2020-04-01 00:50:16              385
##  6         1074 2020-04-01 00:59:10 2020-04-01 01:17:04              149
##  7          214 2020-04-01 00:59:58 2020-04-01 01:03:32               54
##  8          939 2020-04-01 01:11:25 2020-04-01 01:27:04               66
##  9          776 2020-04-01 01:40:55 2020-04-01 01:53:51               66
## 10          716 2020-04-01 02:08:43 2020-04-01 02:20:39               76
## # … with 46,783 more rows, and 14 more variables: start_station_name <chr>,
## #   start_lat <dbl>, start_long <dbl>, end station id <dbl>,
## #   end_station_name <chr>, end_lat <dbl>, end_long <dbl>, bikeid <dbl>,
## #   usertype <chr>, postal code <chr>, year <dbl>, month <dbl>,
## #   birth_year <dbl>, gender <dbl>
# compared with 2019's slowest month, January with 69,872 data points 

Let’s plot the April 2020 data

# April 2020: unique start stations only
April_2020 <- cleaned_2020 %>% filter(month == 4)
April_2020 <- April_2020 %>% select(c(start_long, start_lat, start_station_name)) %>% unique()
# Remove outlier if it is there
April_2020 <- April_2020[!(April_2020$start_long == 0.0000|April_2020$start_lat == 0.0000), ]

p_popup <- paste0("<strong>Neighborhood: </strong>", projected_neighborhoods$Name) 

p6_popup <- paste0("<strong>Start station: </strong>", April_2020$start_station_name)

# neighborhoods (yellow) with April 2020 start stations
leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>% 
  addCircleMarkers(lng = April_2020$start_long,
                   lat = April_2020$start_lat,
                   # BUG FIX: was p4_popup (the January 2019 labels, also the
                   # wrong length for these markers); use the April labels
                   popup = p6_popup)
# count unique start stations in April 2020 (a shorter route would be
# n_distinct(); note the printed tibble reports 326, not 329)
data_2020 %>% select(c(month, start_station_name)) %>% filter(month == 4) %>% select(start_station_name) %>% unique() # 326 unique station names in April 2020, which means that there are fewer unique stations than in January 2019, but probably not significantly so
## # A tibble: 326 × 1
##    start_station_name                                                    
##    <chr>                                                                 
##  1 MIT at Mass Ave / Amherst St                                          
##  2 Tremont St at Hamilton Pl                                             
##  3 Chinatown Gate Plaza                                                  
##  4 Harvard University River Houses at DeWolfe St / Cowperthwaite St      
##  5 Albany St at E. Brookline St                                          
##  6 175 N Harvard St                                                      
##  7 Tremont St at West St                                                 
##  8 Commonwealth Ave at Griggs St                                         
##  9 Central Sq Post Office / Cambridge City Hall at Mass Ave / Pleasant St
## 10 Gallivan Blvd at Adams St                                             
## # … with 316 more rows
# let's compare the April 2020 map with the January 2019 map
# (p_popup still holds the neighborhood labels from the April chunk above)
p4_popup <- paste0("<strong>Start station: </strong>", January_2019$start_station_name)

leaflet(projected_neighborhoods) %>% 
  addPolygons(
    stroke = TRUE, 
    fillColor = "yellow", 
    fillOpacity = 0.8, smoothFactor = 0.5, # to make it look nicer
    popup = p_popup) %>% # add popup 
  addTiles() %>%
  setView(lng = -71.077083, lat = 42.341145, zoom = 11.5) %>% 
  addCircleMarkers(lng = January_2019$start_long,
                   lat = January_2019$start_lat,
                   popup = p4_popup)
# April 2020 start stations with complete coordinates
points_April_2020 <- April_2020 %>%
  filter(!is.na(start_long), !is.na(start_lat))

# lon/lat points as sf (WGS 84), then project to the metric CRS (EPSG:3035)
points_April_sf_2020 <- st_as_sf(points_April_2020, coords = c("start_long", "start_lat"), crs = 4326)
points_April_3035_2020 <- st_transform(points_April_sf_2020, crs = 3035)

# convex hull around all April 2020 start stations
points_April_ch_2020 <- st_convex_hull(st_union(points_April_3035_2020))

Get the intersections

# get the intersections
points_neighborhoods_intersection_April_2020 <- st_intersection(points_April_ch_2020, st_geometry(neighborhoods_3035))

# dissolve the intersection pieces into a single feature
p_n_buff_April_2020 <- st_union(points_neighborhoods_intersection_April_2020)

Plot the objects together

# April 2020 hull with neighborhoods intersection (red) and dissolved union
plot(points_April_ch_2020)
plot(points_neighborhoods_intersection_April_2020, border='red', lwd=2, add = TRUE)
plot(p_n_buff_April_2020, add = TRUE)

The points are much more centered toward the central part of Boston in January 2019 than in April 2020, where there are many more points in the southern part of the city.

compare intersection of April 2020 and January 2020

Routes

# number of unique end stations
unique_end %>% select(end_station_name) %>% unique() %>%  count() # 360
## # A tibble: 1 × 1
##       n
##   <int>
## 1   360
# trips per end station
stations_2019 <- table(data_2019$end_station_name)
# convert to tibble
stations_2019_df <- as.data.frame(stations_2019) %>% as_tibble()
# get the indices in ascending order of frequency
order(stations_2019_df$Freq)
##   [1]  13 237 341 233  20 232 356 348 300  82  86  88 123 150 160 236 346 247
##  [19] 187 246 234 318 309 225 222  46 155  34  14  69 260 315 344 362 214   8
##  [37]  93 162  99 156 161 343 133 154   5 306  51 258 322 355 288 193  33 330
##  [55] 165 196  39  85 311  92 313  40 332 333  47 216 223 102 114 131 146 121
##  [73] 157 335  27 159 145 104 294  41 242 182 316 274 334 310 340  98 361 273
##  [91]  26 136 125  66 254 151 134  54   2 143 277 349  31 314 281 138 204  50
## [109] 148 312  12 140 287 231   9 265  68 135 345 329  73  43 262 110 249  10
## [127]  15 336 132 158  58  67 137 181 115  32  57 279 347 363 107 224 270 295
## [145] 101  45  90 139 280 111 153 144 352 217  87 351 255 266 353 127  70  42
## [163] 198 226 124 152 243  18 177 189 186 245 290 308  49 211  17  37  72 286
## [181] 197 213 304 364 141 219  23  64 105   7  19 235  16 220 149  28 293 202
## [199]  75 221 285 184 303 194  11 215 240 188  65 167  74 164 112 350 261 239
## [217] 147 116 267 326   3  94 195 113 250 206  71 212 360 264 283 324 317 251
## [235] 106 190  52 191 185 359 122 299 176 253 305 339 296  63 272 323 208 302
## [253] 169 292 179 289 109  56 100 327 268  38 142 118 103 337 297 117   4 168
## [271]  44  35 269 259 301 271 166  91 171 126 276 199  76 338 178 358 357  77
## [289] 248 163 320   1 319  59 257  25  95 175   6 192 307 328  79  21  78 275
## [307] 203 183 325 291 128 321  81 180 174 263 342 172  55  96 119 256 129 170
## [325]  29 201 278 282 207  24 130 284 331  60  53 244 209  62 210 238 205 218
## [343]  48 354 108  80  89  36 252 120  30  83  61  97 230 200 228 173 298 229
## [361] 241  22  84 227
# select the 36 most-used end stations: the largest counts are the last 36
# entries of order(Freq), so compute them instead of hard-coding the index
# vector copied from the printed output above (same rows, same order)
top36_2019_end <- stations_2019_df[tail(order(stations_2019_df$Freq), 36), ]
# rename station name column
top36_2019_end <- rename(top36_2019_end,
                           "end_station_name" = "Var1")
# merge data frame with longitude and latitude coordinates from data_2019
top36_2019_end <- left_join(top36_2019_end, unique_end, by = "end_station_name")

# start/end station coordinates as sf point layers (WGS 84)
coords_start_2019 <- top36_2019 %>% st_as_sf(coords = c("start_long", "start_lat"), crs = 4326)

coords_end_2019 <- top36_2019_end %>% st_as_sf(coords = c("end_long", "end_lat"), crs = 4326)

#lines <- st_sfc(mapply(function(a,b){
 # st_cast(st_union(a,b),"LINESTRING")}, 
  #coords_start_2019$geometry, coords_end_2019$geometry, SIMPLIFY=FALSE))
# https://stackoverflow.com/questions/65498300/how-to-efficiently-create-linestrings-from-points